Import all Libraries

In [116]:
import warnings
warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Library to split data
from sklearn.model_selection import train_test_split

# Libraries to help with model building
from sklearn.linear_model import LogisticRegression

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit from the number of displayed columns and rows.
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.max_rows", 200)

import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo


# for statistical analysis 
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
)

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
In [117]:
pip install plotly
^C
Note: you may need to restart the kernel to use updated packages.

Load Data

In [119]:
data = pd.read_csv('BankChurners.csv')
#data = data[data.columns[:-2]]
data.head(5) # Check first 5 rows
Out[119]:
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
0 768805383 Existing Customer 45 M 3 High School Married $60K - $80K Blue 39 5 1 3 12691.000 777 11914.000 1.335 1144 42 1.625 0.061
1 818770008 Existing Customer 49 F 5 Graduate Single Less than $40K Blue 44 6 1 2 8256.000 864 7392.000 1.541 1291 33 3.714 0.105
2 713982108 Existing Customer 51 M 3 Graduate Married $80K - $120K Blue 36 4 1 0 3418.000 0 3418.000 2.594 1887 20 2.333 0.000
3 769911858 Existing Customer 40 F 4 High School NaN Less than $40K Blue 34 3 4 1 3313.000 2517 796.000 1.405 1171 20 2.333 0.760
4 709106358 Existing Customer 40 M 3 Uneducated Married $60K - $80K Blue 21 5 1 0 4716.000 0 4716.000 2.175 816 28 2.500 0.000
In [120]:
data.tail(5) # Check last 5 rows
Out[120]:
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
10122 772366833 Existing Customer 50 M 2 Graduate Single $40K - $60K Blue 40 3 2 3 4003.000 1851 2152.000 0.703 15476 117 0.857 0.462
10123 710638233 Attrited Customer 41 M 2 NaN Divorced $40K - $60K Blue 25 4 2 3 4277.000 2186 2091.000 0.804 8764 69 0.683 0.511
10124 716506083 Attrited Customer 44 F 1 High School Married Less than $40K Blue 36 5 3 4 5409.000 0 5409.000 0.819 10291 60 0.818 0.000
10125 717406983 Attrited Customer 30 M 2 Graduate NaN $40K - $60K Blue 36 4 3 3 5281.000 0 5281.000 0.535 8395 62 0.722 0.000
10126 714337233 Attrited Customer 43 F 2 Graduate Married Less than $40K Silver 25 6 2 4 10388.000 1961 8427.000 0.703 10294 61 0.649 0.189

Exploratory Data analysis

Let us perform Univariate, Bivariate and multi variate analysis and get some insights

In [100]:
fig = make_subplots(rows=2, cols=1)

i1=go.Box(x=data['Customer_Age'],name='Age Box Plot',boxmean=True)
i2=go.Histogram(x=data['Customer_Age'],name='Age Histogram')

fig.add_trace(i1,row=1,col=1)
fig.add_trace(i2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
fig.show()

Age variable is nearly normsl and has mean of 46.3 and median of 46

In [101]:
fig = make_subplots(
    rows=2, cols=2,subplot_titles=('','<b>Platinum Card Holders','<b>Blue Card Holders<b>','Residuals'),
    vertical_spacing=0.09,
    specs=[[{"type": "pie","rowspan": 2}       ,{"type": "pie"}] ,
           [None                               ,{"type": "pie"}]            ,                                      
          ]
)

fig.add_trace(
    go.Pie(values=data.Gender.value_counts().values,labels=['<b>Female<b>','<b>Male<b>'],hole=0.3,pull=[0,0.3]),
    row=1, col=1
)

fig.add_trace(
    go.Pie(
        labels=['Female Platinum Card Holders','Male Platinum Card Holders'],
        values=data.query('Card_Category=="Platinum"').Gender.value_counts().values,
        pull=[0,0.05,0.5],
        hole=0.3
        
    ),
    row=1, col=2
)

fig.add_trace(
    go.Pie(
        labels=['Female Blue Card Holders','Male Blue Card Holders'],
        values=data.query('Card_Category=="Blue"').Gender.value_counts().values,
        pull=[0,0.2,0.5],
        hole=0.3
    ),
    row=2, col=2
)
fig.update_layout(
    height=800,
    showlegend=True,
    title_text="<b>Distribution Of Gender And Different Card Statuses<b>",
)

fig.show()

If we check gender across various cards

Females were more card holders across both types of cards. 52.9% of card holders were female

55% of Platinum were held by female and 54.1% of Blue card holder were female

In [102]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Dependent_count'],name='Dependent count Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Dependent_count'],name='Dependent count Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Dependent counts (close family size)")
fig.show()

Distribution of dependents is slightly right skewed, with mean of 2.3 and median of 2

In [103]:
ex.pie(data,names='Education_Level',title='Propotion Of Education Levels',hole=0.33)

30.1 % of customers wee graduate, 19.9% were High school graduates and 14.7% uneducated

In [104]:
ex.pie(data,names='Marital_Status',title='Propotion Of Different Marriage Statuses',hole=0.33)

46.3 % of customers were married and 38.9 % were single

In [9]:
ex.pie(data,names='Income_Category',title='Propotion Of Different Income Levels',hole=0.33)

35.2% of people had income less than 40k and 17.7% had income of 40k

In [10]:
ex.pie(data,names='Card_Category',title='Propotion Of Different Card Categories',hole=0.33)

93.2% of credit cards were Blue cards, followed by silver card holder of 5.4 %

In [11]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Months_on_book'],name='Months on book Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Months_on_book'],name='Months on book Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of months the customer is part of the bank")
fig.show()

most of customers with Bank for a mean time of 36 month. Timeperiod the customers with Bank is fairly normal distribution

In [12]:
print('Kurtosis of Months on book features is : {}'.format(data['Months_on_book'].kurt()))
Kurtosis of Months on book features is : 0.40010012019986707

It is fairly normal, based on value of Kurtosis

In [13]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Total_Relationship_Count'],name='Total no. of products Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Total_Relationship_Count'],name='Total no. of products Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Total no. of products held by the customer")
fig.show()

Total No of Products is slightly left skewed mean being 3.8 and median is 4

In [14]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Months_Inactive_12_mon'],name='number of months inactive Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Months_Inactive_12_mon'],name='number of months inactive Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the number of months inactive in the last 12 months")
fig.show()

Number of months inactive is slightly right skewed with mean of 2.341 months and median of 2 months

In [ ]:
 
In [15]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Credit_Limit'],name='Credit_Limit Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Credit_Limit'],name='Credit_Limit Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the Credit Limit")
fig.show()

Credit limit is heavily right skewed with mean around 8.6k and median around 4.5k

In [16]:
fig = make_subplots(rows=2, cols=1)

tr1=go.Box(x=data['Total_Trans_Amt'],name='Total_Trans_Amt Box Plot',boxmean=True)
tr2=go.Histogram(x=data['Total_Trans_Amt'],name='Total_Trans_Amt Histogram')

fig.add_trace(tr1,row=1,col=1)
fig.add_trace(tr2,row=2,col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the Total Transaction Amount (Last 12 months)")
fig.show()

Total transaction amount is heavily right skewed with mean=4million transactions and median = 3.8 million transactions

In [17]:
ex.pie(data,names='Attrition_Flag',title='Proportion of churn vs not churn customers',hole=0.33)

We have 83.9% as customers currently and 16.1% of customers have attrited.

In [18]:
data
Out[18]:
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
0 768805383 Existing Customer 45 M 3 High School Married $60K - $80K Blue 39 5 1 3 12691.000 777 11914.000 1.335 1144 42 1.625 0.061
1 818770008 Existing Customer 49 F 5 Graduate Single Less than $40K Blue 44 6 1 2 8256.000 864 7392.000 1.541 1291 33 3.714 0.105
2 713982108 Existing Customer 51 M 3 Graduate Married $80K - $120K Blue 36 4 1 0 3418.000 0 3418.000 2.594 1887 20 2.333 0.000
3 769911858 Existing Customer 40 F 4 High School NaN Less than $40K Blue 34 3 4 1 3313.000 2517 796.000 1.405 1171 20 2.333 0.760
4 709106358 Existing Customer 40 M 3 Uneducated Married $60K - $80K Blue 21 5 1 0 4716.000 0 4716.000 2.175 816 28 2.500 0.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10122 772366833 Existing Customer 50 M 2 Graduate Single $40K - $60K Blue 40 3 2 3 4003.000 1851 2152.000 0.703 15476 117 0.857 0.462
10123 710638233 Attrited Customer 41 M 2 NaN Divorced $40K - $60K Blue 25 4 2 3 4277.000 2186 2091.000 0.804 8764 69 0.683 0.511
10124 716506083 Attrited Customer 44 F 1 High School Married Less than $40K Blue 36 5 3 4 5409.000 0 5409.000 0.819 10291 60 0.818 0.000
10125 717406983 Attrited Customer 30 M 2 Graduate NaN $40K - $60K Blue 36 4 3 3 5281.000 0 5281.000 0.535 8395 62 0.722 0.000
10126 714337233 Attrited Customer 43 F 2 Graduate Married Less than $40K Silver 25 6 2 4 10388.000 1961 8427.000 0.703 10294 61 0.649 0.189

10127 rows × 21 columns

In [105]:
sns.pairplot(data, hue="Attrition_Flag")
Out[105]:
<seaborn.axisgrid.PairGrid at 0x2ce06140a90>
In [106]:
plt.figure(figsize=(15,7))
sns.heatmap(data.corr(),annot=True,vmin=-1,vmax=1,fmt='.2f',cmap='Spectral')
plt.show()

Total transaction amount and total transaction count are highly correlated with 0.81 followed by high correlation between month on book and customer age of 0.79.

In [135]:
sns.barplot(x=data['Attrition_Flag'],y=data['Credit_Limit'], hue= data['Gender'])
Out[135]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce213afc10>
In [125]:
### Function to plot distributions and Boxplots of customers

def plot(x,target='Attrition_Flag'):
    fig,axs = plt.subplots(2,2,figsize=(12,10))
    axs[0, 0].set_title(f'Distribution of {x}  of customer who attrited',fontsize=12,fontweight='bold')
    sns.distplot(data[(data[target] == 1)][x],ax=axs[0,0],color='teal')
    axs[0, 1].set_title(f"Distribution of {x}  of customer who didn't attrite",fontsize=12,fontweight='bold')
    sns.distplot(data[(data[target] == 0)][x],ax=axs[0,1],color='orange')
    axs[1,0].set_title(f'Boxplot of {x} w.r.t ProductTaken',fontsize=12,fontweight='bold')
    
    line = plt.Line2D((.1,.9),(.5,.5), color='grey', linewidth=1.5,linestyle='--')
    fig.add_artist(line)
   
    sns.boxplot(data[target],data[x],ax=axs[1,0],palette='gist_rainbow',showmeans=True)
    axs[1,1].set_title(f'Boxplot of {x} w.r.t Personal Loan - Without outliers',fontsize=12,fontweight='bold')
    sns.boxplot(data[target],data[x],ax=axs[1,1],showfliers=False,palette='gist_rainbow',showmeans=True) #turning off outliers from boxplot
    plt.tight_layout(pad=4)
    plt.show()
In [136]:
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Trans_Ct'], hue= data['Gender'])
Out[136]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce21403c70>

Existing customers had higher transactions compared to attrited customers

In [137]:
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Trans_Amt'], hue= data['Gender'])
Out[137]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce213b5760>

Existing customers had higher transactions amount compared to attrited customers

In [139]:
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Ct_Chng_Q4_Q1'], hue= data['Gender'])
Out[139]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce2141a310>

Existing customers had higher Total_Ct_ChngQ4_Q1

Existing female customers had higher Total_CtChng_Q4_Q1 amount compared to existing Males

Attrited male customers had higher Total_CtChng_Q4_Q1 amount compared to attrited females

In [140]:
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Amt_Chng_Q4_Q1'], hue= data['Gender'])
Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce21524790>

Existing customers had higher Total_amt_ChngQ4_Q1

Existing female and male customers had same Total_amt_Chng_Q4_Q1

Attrited male customers had higher Total_amt_Chng_Q4_Q1 amount compared to attrited females

In [141]:
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Revolving_Bal'], hue= data['Gender'])
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ce2158ca90>

Existing customers had higher Total_Revolving_Bal

Existing male customers had slightly higher Total_Revolving_Bal compared to female customers.

Attrited male and female customers had same Total_Revolving_Bal.

In [128]:
plot('Customer_Age')

Data Preprocessing

We can replace Attrition_Flag into binary and also gender variable can be converted to binary

In [19]:
data.Attrition_Flag = data.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0}) # replacing values of attrition flag
In [20]:
data.Gender = data.Gender.replace({'F':1,'M':0}) # replacing values of Gender
In [21]:
data.isnull().sum() # Finding sum of all null values across all variables
Out[21]:
CLIENTNUM                      0
Attrition_Flag                 0
Customer_Age                   0
Gender                         0
Dependent_count                0
Education_Level             1519
Marital_Status               749
Income_Category                0
Card_Category                  0
Months_on_book                 0
Total_Relationship_Count       0
Months_Inactive_12_mon         0
Contacts_Count_12_mon          0
Credit_Limit                   0
Total_Revolving_Bal            0
Avg_Open_To_Buy                0
Total_Amt_Chng_Q4_Q1           0
Total_Trans_Amt                0
Total_Trans_Ct                 0
Total_Ct_Chng_Q4_Q1            0
Avg_Utilization_Ratio          0
dtype: int64

We have null values across variables Education_Level and MArital_Status

In [22]:
data.info() # to check for  data types of various variables
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  int64  
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  int64  
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           8608 non-null   object 
 6   Marital_Status            9378 non-null   object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_Revolving_Bal       10127 non-null  int64  
 15  Avg_Open_To_Buy           10127 non-null  float64
 16  Total_Amt_Chng_Q4_Q1      10127 non-null  float64
 17  Total_Trans_Amt           10127 non-null  int64  
 18  Total_Trans_Ct            10127 non-null  int64  
 19  Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 20  Avg_Utilization_Ratio     10127 non-null  float64
dtypes: float64(5), int64(12), object(4)
memory usage: 1.6+ MB
In [23]:
data.nunique() # To check for unique values across various variables
Out[23]:
CLIENTNUM                   10127
Attrition_Flag                  2
Customer_Age                   45
Gender                          2
Dependent_count                 6
Education_Level                 6
Marital_Status                  3
Income_Category                 6
Card_Category                   4
Months_on_book                 44
Total_Relationship_Count        6
Months_Inactive_12_mon          7
Contacts_Count_12_mon           7
Credit_Limit                 6205
Total_Revolving_Bal          1974
Avg_Open_To_Buy              6813
Total_Amt_Chng_Q4_Q1         1158
Total_Trans_Amt              5033
Total_Trans_Ct                126
Total_Ct_Chng_Q4_Q1           830
Avg_Utilization_Ratio         964
dtype: int64
In [24]:
data = data.drop(['CLIENTNUM'], axis=1) # Let Drop Customer Id Column
In [25]:
data= data.dropna(axis=0, how='any') # Drop any other null values

I have dropped Clientnumber as it is not of any specific importance to analyse and derive ur best model.

I have also dropped drop all null values . We may have lost some usefull input but I thought it would be better to drop

In [26]:
data['Attrition_Flag'] = data['Attrition_Flag'].astype('category')
data['Gender'] = data['Gender'].astype('category')
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7973 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Attrition_Flag            7973 non-null   category
 1   Customer_Age              7973 non-null   int64   
 2   Gender                    7973 non-null   category
 3   Dependent_count           7973 non-null   int64   
 4   Education_Level           7973 non-null   object  
 5   Marital_Status            7973 non-null   object  
 6   Income_Category           7973 non-null   object  
 7   Card_Category             7973 non-null   object  
 8   Months_on_book            7973 non-null   int64   
 9   Total_Relationship_Count  7973 non-null   int64   
 10  Months_Inactive_12_mon    7973 non-null   int64   
 11  Contacts_Count_12_mon     7973 non-null   int64   
 12  Credit_Limit              7973 non-null   float64 
 13  Total_Revolving_Bal       7973 non-null   int64   
 14  Avg_Open_To_Buy           7973 non-null   float64 
 15  Total_Amt_Chng_Q4_Q1      7973 non-null   float64 
 16  Total_Trans_Amt           7973 non-null   int64   
 17  Total_Trans_Ct            7973 non-null   int64   
 18  Total_Ct_Chng_Q4_Q1       7973 non-null   float64 
 19  Avg_Utilization_Ratio     7973 non-null   float64 
dtypes: category(2), float64(5), int64(9), object(4)
memory usage: 1.2+ MB
In [27]:
data = pd.get_dummies(data, columns=['Education_Level','Marital_Status','Income_Category','Card_Category','Gender']) #create dummy variables

I have converted few variables to categorical variables and then got dummies for one hot encoding

In [28]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7973 entries, 0 to 10126
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Attrition_Flag                  7973 non-null   category
 1   Customer_Age                    7973 non-null   int64   
 2   Dependent_count                 7973 non-null   int64   
 3   Months_on_book                  7973 non-null   int64   
 4   Total_Relationship_Count        7973 non-null   int64   
 5   Months_Inactive_12_mon          7973 non-null   int64   
 6   Contacts_Count_12_mon           7973 non-null   int64   
 7   Credit_Limit                    7973 non-null   float64 
 8   Total_Revolving_Bal             7973 non-null   int64   
 9   Avg_Open_To_Buy                 7973 non-null   float64 
 10  Total_Amt_Chng_Q4_Q1            7973 non-null   float64 
 11  Total_Trans_Amt                 7973 non-null   int64   
 12  Total_Trans_Ct                  7973 non-null   int64   
 13  Total_Ct_Chng_Q4_Q1             7973 non-null   float64 
 14  Avg_Utilization_Ratio           7973 non-null   float64 
 15  Education_Level_College         7973 non-null   uint8   
 16  Education_Level_Doctorate       7973 non-null   uint8   
 17  Education_Level_Graduate        7973 non-null   uint8   
 18  Education_Level_High School     7973 non-null   uint8   
 19  Education_Level_Post-Graduate   7973 non-null   uint8   
 20  Education_Level_Uneducated      7973 non-null   uint8   
 21  Marital_Status_Divorced         7973 non-null   uint8   
 22  Marital_Status_Married          7973 non-null   uint8   
 23  Marital_Status_Single           7973 non-null   uint8   
 24  Income_Category_$120K +         7973 non-null   uint8   
 25  Income_Category_$40K - $60K     7973 non-null   uint8   
 26  Income_Category_$60K - $80K     7973 non-null   uint8   
 27  Income_Category_$80K - $120K    7973 non-null   uint8   
 28  Income_Category_Less than $40K  7973 non-null   uint8   
 29  Income_Category_abc             7973 non-null   uint8   
 30  Card_Category_Blue              7973 non-null   uint8   
 31  Card_Category_Gold              7973 non-null   uint8   
 32  Card_Category_Platinum          7973 non-null   uint8   
 33  Card_Category_Silver            7973 non-null   uint8   
 34  Gender_0                        7973 non-null   uint8   
 35  Gender_1                        7973 non-null   uint8   
dtypes: category(1), float64(5), int64(9), uint8(21)
memory usage: 1.1 MB
In [29]:
data.isnull().sum()
Out[29]:
Attrition_Flag                    0
Customer_Age                      0
Dependent_count                   0
Months_on_book                    0
Total_Relationship_Count          0
Months_Inactive_12_mon            0
Contacts_Count_12_mon             0
Credit_Limit                      0
Total_Revolving_Bal               0
Avg_Open_To_Buy                   0
Total_Amt_Chng_Q4_Q1              0
Total_Trans_Amt                   0
Total_Trans_Ct                    0
Total_Ct_Chng_Q4_Q1               0
Avg_Utilization_Ratio             0
Education_Level_College           0
Education_Level_Doctorate         0
Education_Level_Graduate          0
Education_Level_High School       0
Education_Level_Post-Graduate     0
Education_Level_Uneducated        0
Marital_Status_Divorced           0
Marital_Status_Married            0
Marital_Status_Single             0
Income_Category_$120K +           0
Income_Category_$40K - $60K       0
Income_Category_$60K - $80K       0
Income_Category_$80K - $120K      0
Income_Category_Less than $40K    0
Income_Category_abc               0
Card_Category_Blue                0
Card_Category_Gold                0
Card_Category_Platinum            0
Card_Category_Silver              0
Gender_0                          0
Gender_1                          0
dtype: int64

Rechecked all variables for data type and null values

Data Preparation for Modeling

Split Data

In [30]:
df = data.copy() # Let us copy data into dataframe
In [31]:
X = df.drop(["Attrition_Flag"], axis=1) #drop the dependent variable 
y = df["Attrition_Flag"] # defining dependent variable
In [32]:
# Splitting data into training, validation and test sets:
# first we split data into 2 parts, say temporary and test

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# then we split the temporary set into train and validation

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(4783, 35) (1595, 35) (1595, 35)
In [33]:
models = []  # Empty list to store all the models

# Appending models into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))

results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
score = []
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_val, model.predict(X_val))
    score.append(scores)
    print("{}: {}".format(name, scores))
Cross-Validation Performance:

Bagging: 80.77350993377485
Random forest: 70.69403973509935
GBM: 83.95496688741721
Adaboost: 84.21810154525386
Xgboost: 86.47328918322295
dtree: 77.18763796909492

Validation Performance:

Bagging: 0.8452380952380952
Random forest: 0.7420634920634921
GBM: 0.8333333333333334
Adaboost: 0.8174603174603174
Xgboost: 0.8809523809523809
dtree: 0.8293650793650794

When we check validation and cross validation performance across training data we can see that Xgboost gives best performance

In [34]:
# Plotting boxplots for CV scores of all models defined above
fig = plt.figure()

fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)

plt.boxplot(results)
ax.set_xticklabels(names)

plt.show()

Xgboost, Adaboost and GBM algorithm perform quiet well as we look into Boxplot

Model evaluation criterion:

What does a bank want?

A bank wants to maximize customers and reduce attrition
Type 1 error is predicting someone has attrited when still he/she is customer of bank
Type 2 error is predicting someone is customer when he/she has attrited
In our context type 2 error would be of more importance to us.
Recall - It gives the ratio of True positives to Actual positives, so high Recall implies low false negatives, i.e. low chances of type 2 error.
In [35]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1,
        },
        index=[0],
    )

    return df_perf
In [36]:
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

Logistic Regression

In [37]:
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
Out[37]:
LogisticRegression(random_state=1)

Let's evaluate the model performance by using KFold and cross_val_score

K-Folds cross-validation provides dataset indices to split data into train/validation sets. Split dataset into k consecutive stratified folds (without shuffling by default). Each fold is then used once as validation while the k - 1 remaining folds form the training set.

In [38]:
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()

Performance on training set varies between 0.41 to 0.45 recall.

Let's check the performance on validation data.

In [39]:
# Calculating different metrics on train set
log_reg_model_train_perf = model_performance_classification_sklearn(
    lr, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
Out[39]:
Accuracy Recall Precision F1
0 0.881 0.472 0.677 0.556
In [40]:
# Calculating different metrics on validation set
log_reg_model_val_perf = model_performance_classification_sklearn(lr, X_val, y_val)
print("Validation performance:")
log_reg_model_val_perf
Validation performance:
Out[40]:
Accuracy Recall Precision F1
0 0.877 0.468 0.656 0.546
In [41]:
# creating confusion matrix
confusion_matrix_sklearn(lr, X_val, y_val)
Logistic Regression has given a generalized performance on training and validation set.
Let's try oversampling (increase training data) to see if the model performance can be improved.

Logistic regression on over sampled data

Oversampling train data using SMOTE

In [42]:
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))

sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)


print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
Before UpSampling, counts of label 'Yes': 754
Before UpSampling, counts of label 'No': 4029 

After UpSampling, counts of label 'Yes': 4029
After UpSampling, counts of label 'No': 4029 

In [43]:
log_reg_over = LogisticRegression(random_state=1)

# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over, y_train_over)
Out[43]:
LogisticRegression(random_state=1)
In [44]:
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
Performance on training set varies between 0.78 to 0.785 recall
Let's check the performance on Validation set
In [45]:
# Calculating different metrics on train set
log_reg_over_train_perf = model_performance_classification_sklearn(
    log_reg_over, X_train_over, y_train_over
)
print("Training performance:")
log_reg_over_train_perf
Training performance:
Out[45]:
Accuracy Recall Precision F1
0 0.817 0.813 0.821 0.817
In [46]:
# Calculating different metrics on validation set
log_reg_over_val_perf = model_performance_classification_sklearn(
    log_reg_over, X_val, y_val
)
print("validation performance:")
log_reg_over_val_perf
validation performance:
Out[46]:
Accuracy Recall Precision F1
0 0.825 0.794 0.468 0.589
In [47]:
# creating confusion matrix
confusion_matrix_sklearn(log_reg_over, X_val, y_val)
Performance on the training set and validation set improved with good recall but low f1 score.
Lets try:
a) Undersampling the train to handle the imbalance between classes and check the model performance
b) Regularization to see if overfitting can be reduced

Undersampling train data using RandomUnderSampler

In [48]:
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
In [49]:
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))

print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))

print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 754
Before Under Sampling, counts of label 'No': 4029 

After Under Sampling, counts of label 'Yes': 754
After Under Sampling, counts of label 'No': 754 

After Under Sampling, the shape of train_X: (1508, 35)
After Under Sampling, the shape of train_y: (1508,) 

Logistic Regression on undersampled data

In [50]:
log_reg_under = LogisticRegression(random_state=1)
log_reg_under.fit(X_train_un, y_train_un)
Out[50]:
LogisticRegression(random_state=1)

Let's evaluate the model performance by using KFold and cross_val_score

In [51]:
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=log_reg_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
Performance of model on training set varies between 0.78 to 0.82, with undersampled data
Let's check the performance on the validation set.
In [52]:
# Calculating different metrics on train set
log_reg_under_train_perf = model_performance_classification_sklearn(
    log_reg_under, X_train_un, y_train_un
)
print("Training performance:")
log_reg_under_train_perf
Training performance:
Out[52]:
Accuracy Recall Precision F1
0 0.790 0.777 0.798 0.788
In [53]:
# Calculating different metrics on validation set
log_reg_under_val_perf = model_performance_classification_sklearn(
    log_reg_under, X_val, y_val
)
print("Validation performance:")
log_reg_under_val_perf
Validation performance:
Out[53]:
Accuracy Recall Precision F1
0 0.793 0.806 0.419 0.552
In [54]:
# creating confusion matrix
confusion_matrix_sklearn(log_reg_under, X_val, y_val)
Performance on the training set and validation set is similar to oversampling with good recall but low f1 score.

Regularization

In [55]:
# Choose the type of classifier.
lr_estimator = LogisticRegression(random_state=1, solver="saga")

# Grid of parameters to choose from
parameters = {"C": np.arange(0.1, 1.1, 0.1)}

# Run the grid search
grid_obj = GridSearchCV(lr_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)

# Set the clf to the best combination of parameters
lr_estimator = grid_obj.best_estimator_

# Fit the best algorithm to the data.
lr_estimator.fit(X_train_over, y_train_over)
Out[55]:
LogisticRegression(C=0.1, random_state=1, solver='saga')
In [56]:
# Calculating different metrics on train set
log_reg_reg_train_perf = model_performance_classification_sklearn(
    lr_estimator, X_train_over, y_train_over
)
print("Training performance:")
log_reg_reg_train_perf
Training performance:
Out[56]:
Accuracy Recall Precision F1
0 0.677 0.508 0.768 0.612
In [57]:
# Calculating different metrics on validation set
log_reg_reg_val_perf = model_performance_classification_sklearn(
    lr_estimator, X_val, y_val
)
print("Validation performance:")
log_reg_reg_val_perf
Validation performance:
Out[57]:
Accuracy Recall Precision F1
0 0.813 0.500 0.423 0.458
In [58]:
# creating confusion matrix
confusion_matrix_sklearn(lr_estimator, X_val, y_val)
Performance on the training set and validation set using regularisation is having low f1 and recall score.
Let's check with grid serchcv, randomsearchcv, gridsearch and randomsearch cv with xgboost

Decision Tree

GridSearchCV

In [59]:
# Creating pipeline
model = DecisionTreeClassifier(random_state=1)

# Parameter grid to pass in GridSearchCV
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, None],
    "min_samples_split": [2, 4, 7, 10, 15],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5)

# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)

print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2} 
Score: 0.7878145695364239
In [60]:
# Creating new pipeline with best parameters
dtree_tuned1 = DecisionTreeClassifier(
    random_state=1, criterion="entropy", max_depth=None, min_samples_split=2
)

# Fit the model on training data
dtree_tuned1.fit(X_train, y_train)
Out[60]:
DecisionTreeClassifier(criterion='entropy', random_state=1)
In [61]:
# Calculating different metrics on train set
dtree_grid_train = model_performance_classification_sklearn(
    dtree_tuned1, X_train, y_train
)
print("Training performance:")
dtree_grid_train
Training performance:
Out[61]:
Accuracy Recall Precision F1
0 1.000 1.000 1.000 1.000
In [62]:
# Calculating different metrics on validation set
dtree_grid_val = model_performance_classification_sklearn(dtree_tuned1, X_val, y_val)
print("Validation performance:")
dtree_grid_val
Validation performance:
Out[62]:
Accuracy Recall Precision F1
0 0.937 0.829 0.786 0.807
The validation recall has same performance to the validation recall on model with default parameters
The tuned decision tree model is overfitting the training data
The validation recall is still just ~83%
In [63]:
# creating confusion matrix
confusion_matrix_sklearn(dtree_tuned1, X_val, y_val)

RandomizedSearchCV

In [64]:
# Creating pipeline
model = DecisionTreeClassifier(random_state=1)

# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, None],
    "min_samples_split": [2, 4, 7, 10, 15],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring=scorer,
    cv=5,
    random_state=1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)
Best parameters are {'min_samples_split': 2, 'max_depth': None, 'criterion': 'entropy'} with CV score=0.7878145695364239:
In [65]:
# Creating new pipeline with best parameters
dtree_tuned2 = DecisionTreeClassifier(
    random_state=1, criterion="entropy", max_depth=None, min_samples_split=2
)

# Fit the model on training data
dtree_tuned2.fit(X_train, y_train)
Out[65]:
DecisionTreeClassifier(criterion='entropy', random_state=1)
In [66]:
# Calculating different metrics on train set
dtree_random_train = model_performance_classification_sklearn(
    dtree_tuned2, X_train, y_train
)
print("Training performance:")
dtree_random_train
Training performance:
Out[66]:
Accuracy Recall Precision F1
0 1.000 1.000 1.000 1.000
In [67]:
# Calculating different metrics on validation set
dtree_random_val = model_performance_classification_sklearn(dtree_tuned2, X_val, y_val)
print("Validation performance:")
dtree_random_val
Validation performance:
Out[67]:
Accuracy Recall Precision F1
0 0.937 0.829 0.786 0.807
In [68]:
# creating confusion matrix
confusion_matrix_sklearn(dtree_tuned1, X_val, y_val)
We reduced the number of iterations to only 20 but two out of the three parameters are the same as what we got from the grid search.
The validation recall is almost same as Gridsearchcv recall
The recall and accuracy are slightly less but still similar to the results for the decision tree model tuned with GridSearchCV is overfitting the training data

XGBOOST

GridsearchCV

In [69]:
%%time 

#defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')

#Parameter grid to pass in GridSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
            'scale_pos_weight':[2,5,10],
            'learning_rate':[0.01,0.1,0.2,0.05],
            'gamma':[0,1,3,5],
            'subsample':[0.8,0.9,1],
            'max_depth':np.arange(1,5,1),
            'reg_lambda':[5,10]}


# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)

#Fitting parameters in GridSeachCV
grid_cv.fit(X_train,y_train)

print("Best parameters are {} with CV score={}:" .format(grid_cv.best_params_,grid_cv.best_score_))
Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best parameters are {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50, 'reg_lambda': 5, 'scale_pos_weight': 10, 'subsample': 0.9} with CV score=0.9787902869757176:
Wall time: 54min 29s
In [70]:
# building model with best parameters
xgb_tuned1 = XGBClassifier(
    random_state=1,
    n_estimators=50,
    scale_pos_weight=10,
    subsample=0.9,
    learning_rate=0.01,
    gamma=0,
    eval_metric="logloss",
    reg_lambda=5,
    max_depth=2,
)

# Fit the model on training data
xgb_tuned1.fit(X_train, y_train)
Out[70]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=5,
              scale_pos_weight=10, subsample=0.9, tree_method='exact',
              validate_parameters=1, verbosity=None)
In [71]:
# Calculating different metrics on train set
xgboost_grid_train = model_performance_classification_sklearn(
    xgb_tuned1, X_train, y_train
)
print("Training performance:")
xgboost_grid_train
Training performance:
Out[71]:
Accuracy Recall Precision F1
0 0.645 0.980 0.305 0.465
In [72]:
# Calculating different metrics on validation set
xgboost_grid_val = model_performance_classification_sklearn(xgb_tuned1, X_val, y_val)
print("Validation performance:")
xgboost_grid_val
Validation performance:
Out[72]:
Accuracy Recall Precision F1
0 0.652 0.976 0.309 0.470
In [73]:
# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned1, X_val, y_val)
The validation recall has increased significantly compared to the result from cross-validation with default parameters.
The model has reduced overfitting significantly
The model can identify to recall percent of ~97% on validation data

RandomizedSearchCV

In [74]:
%%time

# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')

# Parameter grid to pass in RandomizedSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
            'scale_pos_weight':[2,5,10],
            'learning_rate':[0.01,0.1,0.2,0.05],
            'gamma':[0,1,3,5],
            'subsample':[0.8,0.9,1],
            'max_depth':np.arange(1,5,1),
            'reg_lambda':[5,10]}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

#Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)

#Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)

print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.8, 'scale_pos_weight': 10, 'reg_lambda': 10, 'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.1, 'gamma': 0} with CV score=0.9695011037527594:
Wall time: 57.5 s
In [75]:
# building model with best parameters
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=50,
    scale_pos_weight=10,
    gamma=0,
    subsample=0.8,
    learning_rate=0.1,
    eval_metric="logloss",
    max_depth=2,
    reg_lambda=10,
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
Out[75]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
              scale_pos_weight=10, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
In [76]:
# Calculating different metrics on train set
xgboost_random_train = model_performance_classification_sklearn(
    xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgboost_random_train
Training performance:
Out[76]:
Accuracy Recall Precision F1
0 0.862 0.988 0.534 0.693
In [77]:
# Calculating different metrics on validation set
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val
Validation performance:
Out[77]:
Accuracy Recall Precision F1
0 0.861 0.984 0.533 0.692
In [78]:
# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
We reduced the number of iterations to only 20 but the model performance is very similar to the results for the xgboost model tuned with GridSearchCV

Comparing all models

In [79]:
# training performance comparison

models_train_comp_df = pd.concat(
    [
        log_reg_model_train_perf.T,
        log_reg_over_train_perf.T,
        log_reg_reg_train_perf.T,
        log_reg_under_train_perf.T,
        dtree_grid_train.T,
        dtree_random_train.T,
        xgboost_grid_train.T,
        xgboost_random_train.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Logistic Regression",
    "Logistic Regression with oversampled data",
    "Regularised Logistic Regression",
    "Logistic Regression with undersampled data",
    "Decision Tree Tuned with Grid search",
    "Decision Tree Tuned with Random search",
    "Xgboost Tuned with Grid search",
    "Xgboost Tuned with Random Search",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
Out[79]:
Logistic Regression Logistic Regression with oversampled data Regularised Logistic Regression Logistic Regression with undersampled data Decision Tree Tuned with Grid search Decision Tree Tuned with Random search Xgboost Tuned with Grid search Xgboost Tuned with Random Search
Accuracy 0.881 0.817 0.677 0.790 1.000 1.000 0.645 0.862
Recall 0.472 0.813 0.508 0.777 1.000 1.000 0.980 0.988
Precision 0.677 0.821 0.768 0.798 1.000 1.000 0.305 0.534
F1 0.556 0.817 0.612 0.788 1.000 1.000 0.465 0.693
In [80]:
# Validation performance comparison

models_val_comp_df = pd.concat(
    [
        log_reg_model_val_perf.T,
        log_reg_over_val_perf.T,
        log_reg_reg_val_perf.T,
        log_reg_under_val_perf.T,
        dtree_grid_val.T,
        dtree_random_val.T,
        xgboost_grid_val.T,
        xgboost_random_val.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "Logistic Regression",
    "Logistic Regression with oversampled data",
    "Regularised Logistic Regression",
    "Logistic Regression with undersampled data",
    "Decision Tree Tuned with Grid search",
    "Decision Tree Tuned with Random search",
    "Xgboost Tuned with Grid search",
    "Xgboost Tuned with Random Search",
]
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
Out[80]:
Logistic Regression Logistic Regression with oversampled data Regularised Logistic Regression Logistic Regression with undersampled data Decision Tree Tuned with Grid search Decision Tree Tuned with Random search Xgboost Tuned with Grid search Xgboost Tuned with Random Search
Accuracy 0.877 0.825 0.813 0.793 0.937 0.937 0.652 0.861
Recall 0.468 0.794 0.500 0.806 0.829 0.829 0.976 0.984
Precision 0.656 0.468 0.423 0.419 0.786 0.786 0.309 0.533
F1 0.546 0.589 0.458 0.552 0.807 0.807 0.470 0.692
In [62]:
feature_names = X_train.columns
importances = xgb_tuned1.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()

When we compare and see various models. RandomisedsearchCv and Gridsearchcv with xgboost performed best.

Decession tree tuned with random seach and grid search gave us best F1 score

Total transaction count and Total transaction amount were extremely important features in knowing if customer would attriate

Total ct changeQ4_Q1, Total_Revolving_Bal and Total_Amt_Chg_Q4_Q1 were also other important features

The above features were good enough to get our model.

In [88]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7973 entries, 0 to 10126
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Attrition_Flag                  7973 non-null   category
 1   Customer_Age                    7973 non-null   int64   
 2   Dependent_count                 7973 non-null   int64   
 3   Months_on_book                  7973 non-null   int64   
 4   Total_Relationship_Count        7973 non-null   int64   
 5   Months_Inactive_12_mon          7973 non-null   int64   
 6   Contacts_Count_12_mon           7973 non-null   int64   
 7   Credit_Limit                    7973 non-null   float64 
 8   Total_Revolving_Bal             7973 non-null   int64   
 9   Avg_Open_To_Buy                 7973 non-null   float64 
 10  Total_Amt_Chng_Q4_Q1            7973 non-null   float64 
 11  Total_Trans_Amt                 7973 non-null   int64   
 12  Total_Trans_Ct                  7973 non-null   int64   
 13  Total_Ct_Chng_Q4_Q1             7973 non-null   float64 
 14  Avg_Utilization_Ratio           7973 non-null   float64 
 15  Education_Level_College         7973 non-null   uint8   
 16  Education_Level_Doctorate       7973 non-null   uint8   
 17  Education_Level_Graduate        7973 non-null   uint8   
 18  Education_Level_High School     7973 non-null   uint8   
 19  Education_Level_Post-Graduate   7973 non-null   uint8   
 20  Education_Level_Uneducated      7973 non-null   uint8   
 21  Marital_Status_Divorced         7973 non-null   uint8   
 22  Marital_Status_Married          7973 non-null   uint8   
 23  Marital_Status_Single           7973 non-null   uint8   
 24  Income_Category_$120K +         7973 non-null   uint8   
 25  Income_Category_$40K - $60K     7973 non-null   uint8   
 26  Income_Category_$60K - $80K     7973 non-null   uint8   
 27  Income_Category_$80K - $120K    7973 non-null   uint8   
 28  Income_Category_Less than $40K  7973 non-null   uint8   
 29  Income_Category_abc             7973 non-null   uint8   
 30  Card_Category_Blue              7973 non-null   uint8   
 31  Card_Category_Gold              7973 non-null   uint8   
 32  Card_Category_Platinum          7973 non-null   uint8   
 33  Card_Category_Silver            7973 non-null   uint8   
 34  Gender_0                        7973 non-null   uint8   
 35  Gender_1                        7973 non-null   uint8   
dtypes: category(1), float64(5), int64(9), uint8(21)
memory usage: 1.1 MB

Pipelines for productionizing the model

In [115]:
# creating a list of numerical variables
numerical_features = ["Total_Trans_Ct", "Total_Trans_Amt", "Total_Ct_Chng_Q4_Q1", "Total_Revolving_Bal", "Total_Amt_Chng_Q4_Q1","Education_Level","Mariatal"]

# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])





# handle_unknown = "ignore", allows model to handle any unknown category in the test data

# combining categorical transformer and numerical transformer using a column transformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
In [90]:
# Separating target variable and other variables
X = data.drop("Attrition_Flag", axis=1)
Y = data["Attrition_Flag"]
In [91]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(5581, 35) (2392, 35)
In [92]:
# Creating new pipeline with best parameters
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "XGB",
            XGBClassifier(
                random_state=1,
                n_estimators=50,
                scale_pos_weight=10,
                subsample=0.8,
                learning_rate=0.1,
                gamma=0,
                eval_metric="logloss",
                reg_lambda=10,
                max_depth=2,
            ),
        ),
    ]
)
# Fit the model on training data
model.fit(X_train, y_train)
Out[92]:
Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Total_Trans_Ct',
                                                   'Total_Trans_Amt',
                                                   'Total_Ct_Chng_Q4_Q1',
                                                   'Total_Revolving_Bal',
                                                   'Total_Amt_Chng_Q4_Q1']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncod...
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=2,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=50,
                               n_jobs=4, num_parallel_tree=1, random_state=1,
                               reg_alpha=0, reg_lambda=10, scale_pos_weight=10,
                               subsample=0.8, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

Conclusion and Insights

The best test recall is ~98% but the test precision is very low i.e ~53% at the same time. This means that the model is not good at identifying non-defaulter, therefore, the bank can lose opportunities of giving credit cards assuming the customers might attrite.

The model performance can be improved, especially in terms of precision and the bank can use use the model for new customers once desired level of model performance is achieved.

We saw in our analysis that women customers likely to take credit card more compared to men.

Graduates and high school graduates were more likely to take credit card.

Total_trans_Ct and Total_Trans_amt was extremely important to know if customer would attrite. We can get pattern of those customers who had similar Total_trans_Ct and give offers and flexiblity to stay back.

Bank can also target women graduates and highschool graduates to take credit card.

Male customer tend to have higher credit limit. Male customers can be lured to some extent by offering higher credit limit

Lowereing trend in total_transaction_ct, Total_Revolving_Bal and Lower Transaction_Amt customers might be the ones to attrite. Bank can stratergecally give offer and try to keep those customers

In [ ]: